The model and data at healthdata.org includes start and stop dates for various categories of lockdown measures (e.g. closing schools, any transit restrictions, closures of non-essential businesses) as well as estimates of current cases for many countries (and some sub-country regions). If we approximate the dynamics as exponential processes (like the SIR family of models but without saturation), we can estimate the current rate of spread and correlate that with current lockdown measures.

Some future possibilities:

  • We could use the Google mobility data, or healthdata's aggregate of that mobility data, to identify countermeasures that are effective beyond what we would predict from the change in mobility alone.
In [1]:
from itertools import permutations, product

import holoviews as hv
import numpy as np
import pandas as pd
import pymc3 as pm
In [2]:
# Use the Bokeh backend for holoviews, and set default sizing for all Overlays.
hv.notebook_extension('bokeh')
%opts Overlay [aspect=5/3, responsive=True]
In [3]:
# Daily per-location time series (hospitalization, infections) from the IHME
# (healthdata.org) export; 'date' is parsed up front for the time-based logic below.
# NOTE(review): the two files are read from different snapshot directories
# ('2020_05_28' vs '2020_05_23.03') — confirm mixing data versions is intentional.
healthdata = pd.read_csv('2020_05_28/Hospitalization_all_locs.csv', parse_dates=['date'])

# One row per location with peak-load estimates and countermeasure start/end dates.
stats = pd.read_csv('2020_05_23.03/Summary_stats_all_locs.csv', parse_dates=[
    'peak_bed_day_mean', 'peak_bed_day_lower',
    'peak_bed_day_upper', 'peak_icu_bed_day_mean', 'peak_icu_bed_day_lower',
    'peak_icu_bed_day_upper', 'peak_vent_day_mean', 'peak_vent_day_lower',
    'peak_vent_day_upper', 'travel_limit_start_date', 'travel_limit_end_date',
    'stay_home_start_date', 'stay_home_end_date',
    'educational_fac_start_date', 'educational_fac_end_date',
    'any_gathering_restrict_start_date', 'any_gathering_restrict_end_date',
    'any_business_start_date', 'any_business_end_date',
    'all_non-ess_business_start_date', 'all_non-ess_business_end_date',
])
In [4]:
# Melt the start/end date columns into long form and drop missing values, so an
# asof merge against the most recent change can tell us whether a policy is active.
date_columns = [c for c in stats.columns if 'date' in c]
countermeasures = (
    stats
    .melt(id_vars=['location_name'], value_vars=date_columns, value_name='date')
    .dropna()
)

def _measure_type(column_name):
    # e.g. 'all_non-ess_business_start_date' -> 'all_non_ess_business'
    return '_'.join(column_name.replace('-', '_').split('_')[:-2])

countermeasures['type'] = countermeasures['variable'].map(_measure_type)
countermeasure_types = sorted(set(countermeasures['type']))
# A 'start' date means the measure became enforced; an 'end' date means it was lifted.
countermeasures['enforced'] = countermeasures['variable'].map(
    lambda column_name: column_name.split('_')[-2] == 'start'
)
countermeasures.drop(columns=['variable'], inplace=True)
countermeasures.sort_values('date', inplace=True)
countermeasure_types
Out[4]:
['all_non_ess_business',
 'any_business',
 'any_gathering_restrict',
 'educational_fac',
 'stay_home',
 'travel_limit']
In [5]:
merged = healthdata.copy()

# healthdata uses projections for the past two weeks just as they do for future dates
# as a mitigation for lag in reporting. To look only at estimates for days with data,
# discard anything newer than two weeks before the latest data.
merged.dropna(subset=['confirmed_infections'], inplace=True)
merged = merged.loc[merged['date'] < merged['date'].max() - pd.to_timedelta('2w')]

# Regions that are decomposed are stored here, too, but without countermeasure data.
# Therefore we discard the groups and keep their subdivisions
group_names = [
    'Brazil',
    'Canada',
    'Germany',
    'Italy',
    'Mexico',
    'Spain',
    'United States of America',
]
merged = merged.loc[~merged['location_name'].apply(group_names.__contains__)]

# to reduce noise, only investigate growth once a location has at least N infections
N = 200
merged = merged.loc[merged['est_infections_mean'] > N]

# Drop locations with fewer than 40 remaining rows ('V1' is used only as a
# convenient column to count non-null entries on). The keep-flag Series is also
# named 'V1', so the merge yields 'V1_x' (original) and 'V1_y' (the flag).
enough = merged.groupby('location_name')['V1'].count() >= 40
merged = merged.merge(enough, left_on='location_name', right_index=True)
merged = merged.loc[merged['V1_y']]

# merge_asof below requires the 'on' key to be sorted.
merged.sort_values('date', inplace=True)

# For each countermeasure type, attach the most recent start/end change per
# location; rows before any recorded change get NaN, treated as "not enforced".
for countermeasure_type, subset in countermeasures.groupby('type'):
    subset = subset.drop(columns='type').rename(columns={'enforced': countermeasure_type})
    merged = pd.merge_asof(merged, subset, on='date', by='location_name')
    merged[countermeasure_type].fillna(False, inplace=True)

# Day-over-day changes of the estimated infection counts, computed per location.
cols = ['est_infections_mean', 'est_infections_lower', 'est_infections_upper']
diffs = merged.groupby('location_name')[cols].diff().rename(columns={c: f'diff_{c}' for c in cols})
merged = pd.concat([merged, diffs], axis=1)

# Fractional daily change; for a roughly exponential process this approximates
# the instantaneous growth rate.
merged['fractional_diff_est_infections_mean'] = merged['diff_est_infections_mean'] / merged['est_infections_mean']

merged['day_of_year'] = merged['date'].dt.dayofyear
merged['day_of_week'] = merged['date'].dt.dayofweek
# NOTE(review): unseeded RNG — this reference-noise column changes on every
# run; consider seeding numpy for reproducibility.
merged['random'] = np.random.normal(size=len(merged))

print(len(healthdata), len(merged))
49115 9139
In [6]:
# Count how many countermeasure types are simultaneously active on each row.
merged['num_active'] = 0
for measure in countermeasure_types:
    merged['num_active'] = merged['num_active'] + merged[measure]

# Growth rate over time, colored by the number of active countermeasures.
growth_scatter = hv.Scatter(
    merged, 'date', ['fractional_diff_est_infections_mean', 'num_active']
)
growth_scatter.options(
    color=hv.dim('num_active'), cmap='viridis', colorbar=True,
    alpha=0.3, jitter=5e7, legend_position='right',
    aspect=5/3, responsive=True,
)
Out[6]:
In [7]:
# For each countermeasure, overlay the growth-rate points where the measure is
# not enforced (label 'False') with those where it is ('True'); one row per
# countermeasure, browsable via the HoloMap key.
hv.HoloMap({
    countermeasure_type:
    hv.Scatter(
        merged.loc[~merged[countermeasure_type]],
        'date', ['fractional_diff_est_infections_mean', 'location_name'] + countermeasure_types,
        label='False'
    ).options(alpha=0.3, jitter=5e7, aspect=5/3, responsive=True, tools=['hover']) *
    hv.Scatter(
        merged.loc[merged[countermeasure_type]],
        'date', ['fractional_diff_est_infections_mean', 'location_name'] + countermeasure_types,
        label='True'
    ).options(alpha=0.3, jitter=5e7, tools=['hover'])
    for countermeasure_type in countermeasure_types
}, 'Countermeasure').layout().cols(1)
Out[7]:
In [8]:
# Pairwise grid of countermeasure indicators. The boolean values are displaced:
# vertically by a squashed copy of the growth rate, horizontally by uniform
# noise, so the point clouds separate visually instead of stacking at 0/1.
X = merged.copy()
X['random'] = np.random.uniform(-1, 1, size=len(X))
X.sort_values('random', inplace=True)

plots = {}
for row_measure, col_measure in product(countermeasure_types, repeat=2):
    displaced = X.copy()
    displaced[col_measure] += 0.4 * np.tanh(5.0 * displaced['fractional_diff_est_infections_mean'])
    if row_measure != col_measure:
        displaced[row_measure] += 0.36 * displaced['random']
    plots[(row_measure, col_measure)] = hv.Scatter(
        displaced, [row_measure, col_measure], 'fractional_diff_est_infections_mean'
    ).options(
        color=hv.dim('fractional_diff_est_infections_mean'), alpha=0.05, cmap='viridis'
    )

(
    hv.GridSpace(plots, sort=True)
    .options(xrotation=45, yrotation=45)
    .redim.range(fractional_diff_est_infections_mean=(-0.1, 0.3))
)
Out[8]:
In [9]:
def plot_trace(trace, varnames=None):
    """Plot the distribution and sampling history of pymc latent variables.

    trace: the trace output from pymc.sample
    varnames: optional list of variable names to plot; when falsy, every
        variable whose name does not end with '_' is included.

    Returns a two-column hv.Layout: per-variable density on the left,
    per-chain trace curves on the right.
    """
    if not varnames:
        varnames = [name for name in trace.varnames if not name.endswith('_')]
    panels = []
    for name in varnames:
        chains = trace.get_values(name, combine=False)
        if not isinstance(chains, list):
            chains = [chains]
        density = hv.Overlay(
            [hv.Distribution(chain, [name], [f'p({name})']) for chain in chains],
            group=name,
        )
        history = hv.Overlay(
            [hv.Curve(chain, 'index', name).options(alpha=0.6) for chain in chains]
        )
        panels.append(density.options(aspect=3))
        panels.append(history.options(aspect=3))
    return hv.Layout(panels).cols(2)
In [10]:
# A stay at home order is a gathering restriction
# A shutdown of non-essential businesses is a business restriction
# For more interpretable results, separate their effects

x = merged[[
    'fractional_diff_est_infections_mean',
    'date',
    'day_of_week',
    'location_name'
] + countermeasure_types].dropna()
# Integer codes plus the unique label index, used to expand per-location
# parameters out to one value per row.
location_indices, location_names = pd.factorize(x['location_name'])

# NOTE(review): 'pairs' is never used below — dead code left in place here.
pairs = set(tuple(sorted([a, b])) for a, b in permutations(countermeasure_types, 2))

with pm.Model() as model:
    
    # Per-location baseline growth rate, broadcast to rows via location_indices.
    rate = pm.Normal('baseline_rate', mu=0.4, sigma=0.2, shape=len(location_names))[location_indices]
    
    # Model the weekly periodicity with a cosine whose amplitude and phase are
    # learned per location (day-of-week effects in the data).
    amplitude = pm.HalfNormal('amplitude', sigma=0.4, shape=len(location_names))[location_indices]
    phase0 = pm.Uniform('phase0', 0, 7, shape=len(location_names))[location_indices]
    rate += amplitude * np.cos(2 * np.pi * (np.asarray(x['day_of_week']) - phase0) / 7)
    
    # Additive effect of each countermeasure while enforced. stay_home rows are
    # excluded from any_gathering_restrict, and all_non_ess_business rows from
    # any_business, so the broader indicators capture only the weaker measures.
    rate += pm.Normal('educational_fac', mu=0, sigma=0.4) * np.asarray(x['educational_fac'])
    rate += pm.Normal('travel_limit', mu=0, sigma=0.4) * np.asarray(x['travel_limit'])
    rate += pm.Normal('all_non_ess_business', mu=0, sigma=0.4) * np.asarray(x['all_non_ess_business'])
    rate += pm.Normal('stay_home', mu=0, sigma=0.4) * np.asarray(x['stay_home'])
    rate += pm.Normal('any_gathering_restrict', mu=0, sigma=0.4) * np.asarray(x['any_gathering_restrict'] & ~x['stay_home'])
    rate += pm.Normal('any_business', mu=0, sigma=0.4) * np.asarray(x['any_business'] & ~x['all_non_ess_business'])
    
    # Model effect of both stay_home and all_non_ess_business
    # Should be positive since we expect both to be not quite as good as the sum of each measure separately
    rate += pm.Normal('stay_home&all_non_ess_business', mu=0, sigma=0.4) * np.asarray(x['stay_home'] & x['all_non_ess_business'])
    
    # Record the composed per-row rate so it can be inspected after sampling.
    pm.Deterministic('rate', rate)
    
    # Per-location observation noise around the modeled rate.
    err = pm.HalfNormal('err', sigma=0.4, shape=len(location_names))[location_indices]
    pm.Normal('observations', mu=rate, sigma=err,
              observed=np.asarray(x['fractional_diff_est_infections_mean']))
    
    trace = pm.sample(3_000, tune=1_000, chains=4, cores=4)
    
plot_trace(trace, varnames=countermeasure_types + ['stay_home&all_non_ess_business'])
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [err, stay_home&all_non_ess_business, any_business, any_gathering_restrict, stay_home, all_non_ess_business, travel_limit, educational_fac, phase0, amplitude, baseline_rate]
Sampling 4 chains, 0 divergences: 100%|██████████| 16000/16000 [05:45<00:00, 46.25draws/s]
The number of effective samples is smaller than 10% for some parameters.
Out[10]:
In [11]:
# Overlay the posterior effect distributions of every countermeasure (plus the
# stay_home/all_non_ess_business interaction) for direct comparison.
hv.Overlay([
    hv.Distribution(trace[countermeasure_type], label=countermeasure_type)
    for countermeasure_type in countermeasure_types + ['stay_home&all_non_ess_business']
]).options(aspect=5/3, responsive=True)
Out[11]:
In [12]:
# Tidy per-location posterior samples of the hierarchical parameters.
# Fix: derive the draw count from the trace instead of hard-coding 4000.
# With chains=4 and 3000 draws the trace holds 12000 draws per parameter, so
# the hard-coded repeat made zip() silently truncate to the first 4000 draws
# (a biased subsample dominated by the first chain).
n_draws = trace['baseline_rate'].shape[0]
samples = pd.DataFrame(list(zip(
    trace['baseline_rate'].ravel(),
    trace['amplitude'].ravel(),
    trace['phase0'].ravel(),
    trace['err'].ravel(),
    # ravel of (n_draws, n_locations) is draw-major, so tiling the location
    # labels n_draws times lines them up with the flattened samples
    np.tile(np.array(location_names), n_draws)
)), columns=['baseline_rate', 'amplitude', 'phase0', 'err', 'location'])

# Attach each location's posterior-mean baseline rate and order by it, so the
# violins below read from lowest to highest baseline rate.
samples = samples.merge(samples.groupby('location')['baseline_rate'].mean(), on='location', suffixes=('', '_mean'))
samples.sort_values('baseline_rate_mean', inplace=True)

hv.HoloMap({
    variable: hv.Violin(samples, 'location', variable)
    .options(aspect=5/3, responsive=True, xrotation=90)
    for variable in ['baseline_rate', 'amplitude', 'phase0', 'err']
}).layout().cols(1)
Out[12]:
In [13]:
# 1001 evenly spaced interior percentile levels (exact 0 and 100 excluded).
percentile = np.linspace(0, 100, 1003)[1:-1]
# Two-sided tail probability for each level: the median maps to 100%, the
# 1st/99th percentiles map to 2%.
tail_percentile = 100 - abs(percentile * 2 - 100)

def plot_rate(location):
    """Plot posterior percentile bands of the modeled rate for one location,
    overlaid with its observed fractional daily growth.

    location: a value from ``location_names``, e.g. 'Lombardia'.
    """
    idx = location_names.get_loc(location)
    # Boolean mask selecting this location's rows of x / columns of 'rate'.
    these = location_indices == idx
    rate = trace['rate'][:, these]
    Y = np.percentile(rate, percentile, axis=0)
    
    t = x.loc[these, 'date']
    # One contour per percentile level, colored (log scale) by tail probability.
    df = [{'t': t, 'rate': y, 'P': p} for y, p in zip(Y, tail_percentile)]
    return (
        hv.Contours(df, ['t', 'rate'], 'P')
        .options(cmap='plasma', colorbar=True, show_legend=False, line_width=2, logz=True, cformatter='%.2g%%') *
        hv.Curve(merged.loc[merged['location_name'] == location], 'date', 'fractional_diff_est_infections_mean')
    ).options(aspect=5/3, responsive=True)
In [14]:
# Posterior rate bands vs. observed growth for Lombardia
plot_rate('Lombardia')
Out[14]:
In [15]:
# Posterior rate bands vs. observed growth for Veneto
plot_rate('Veneto')
Out[15]:
In [16]:
# Posterior rate bands vs. observed growth for Virginia
plot_rate('Virginia')
Out[16]:
In [17]:
# hv.DynamicMap(plot_rate, kdims=['Location']).redim.values(Location=sorted(location_names))
In [ ]: